HDBSCAN (Hierarchical Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm that improves upon DBSCAN by allowing the detection of clusters with varying densities. It is particularly effective for discovering clusters in datasets with complex structures or noise.
Key HDBSCAN parameters: min_cluster_size defines the minimum size of clusters, and min_samples controls how conservative the clustering is (points that fit no cluster are labeled noise, -1 by default). Below I explore using HDBSCAN on word embedding vectors to see how well it can cluster similar terms.
from numpy import dot
from numpy.linalg import norm
from openai import AzureOpenAI
from openai import OpenAI
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import hdbscan
from hdbscan import BranchDetector
import os
import plotly.io as pio
import plotly.offline as pyo
# Load environment variables from a local .env file; override=True lets .env
# values take precedence over anything already set in the process environment.
import dotenv
dotenv.load_dotenv(override=True)
#nlp = spacy.load('en_core_web_sm')
# Azure OpenAI credentials/endpoint, expected to be defined in .env.
open_ai_api_key = os.getenv("OPEN_AI_API_KEY")
endpoint = os.getenv("AZURE_ENDPOINT")
# Shared Azure OpenAI client used by generate_embeddings() below.
client = AzureOpenAI(
api_key = open_ai_api_key,
api_version = '2023-07-01-preview',
azure_endpoint = endpoint
)
def generate_embeddings(text, model="text-embedding-ada-002"):
    """Return the embedding vector for *text* from the Azure OpenAI API.

    Parameters
    ----------
    text : str
        The text to embed.
    model : str
        Name of the embedding model/deployment to use.

    Returns
    -------
    list[float] | None
        The embedding vector, or None if the API call raised.
    """
    try:
        return client.embeddings.create(input=[text], model=model).data[0].embedding
    except Exception:
        # Best-effort: swallow API errors (auth, rate limit, network) and
        # return None so callers can skip the term. The previous bare
        # `except:` also caught KeyboardInterrupt/SystemExit, which made the
        # loop un-interruptible.
        return None
def calculate_cosine_similarity(vec1, vec2):
    """Return the cosine similarity between two equal-length vectors.

    Computed with numpy's ``dot`` and ``norm`` (imported at the top of the
    file). The previous version called sklearn's ``cosine_similarity``,
    which is never imported anywhere in this file and would raise a
    NameError on first use.
    """
    return dot(vec1, vec2) / (norm(vec1) * norm(vec2))
# Ground-truth class id for each animal term; used later to score the
# HDBSCAN clustering (ARI / NMI) against what we'd expect.
term_to_class = {
    # Class 0: terrestrial animals
    'cat': 0, 'dog': 0, 'deer': 0, 'lion': 0, 'wolf': 0, 'bear': 0,
    # Class 1: cephalopods
    'octopus': 1, 'squid': 1,
    # Class 2: reptiles
    'alligator': 2, 'crocodile': 2, 'lizard': 2, 'iguana': 2, 'gecko': 2,
    # Class 3: aquatic animals
    'tuna': 3, 'rockfish': 3, 'salmon': 3, 'goldfish': 3,
    'swordfish': 3, 'herring': 3, 'dolphin': 3,
}
# Terms in the dictionary's insertion order, so embeddings, true labels,
# and plot annotations all line up by index.
terms = list(term_to_class.keys())
# One embedding vector per term (value is None if the API call failed).
evs = {term: generate_embeddings(term) for term in terms}
# Ordered list of vectors, aligned with `terms`, fed to the clustering below.
evs_list = [evs[term] for term in terms]
# NOTE: the original cell pasted the notebook's `dict_keys([...])` output
# into the source, which raises a NameError when this file runs as a script;
# that line (and the no-op `evs.keys()` expression) have been removed.
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
def hdbscan_cluster_plot(evs, terms, term_to_class_dict, hdbscan_dist_metric, alpha=.001, leaf_size=1, min_cluster_size=2, min_samples=1):
    """Cluster embedding vectors with HDBSCAN and plot them on a 2-D t-SNE map.

    Parameters
    ----------
    evs : sequence of embedding vectors, one per entry of `terms`, same order.
    terms : list[str]
        The embedded terms; used as point labels on the plot.
    term_to_class_dict : dict[str, int]
        Ground-truth class per term for scoring; terms missing from the
        dict fall back to the noise label -1.
    hdbscan_dist_metric : str
        Distance metric name passed to hdbscan.HDBSCAN.
    alpha, leaf_size, min_cluster_size, min_samples :
        HDBSCAN tuning knobs, forwarded unchanged.

    Returns
    -------
    (fig, term_classes, ari, nmi)
        fig is the Plotly figure; term_classes lists (cluster_label, term)
        pairs; ari/nmi are adjusted Rand index and normalized mutual
        information vs. the ground-truth labels.
    """
    vectors = np.array(evs)  # convert once; reused by both t-SNE and HDBSCAN

    # 2-D projection purely for visualization.
    # NOTE(review): perplexity=1 is extreme and only plausible for tiny term
    # sets like the ~20 animals used here — confirm before reusing elsewhere.
    tsne_model = TSNE(n_components=2, random_state=42, perplexity=1,
                      learning_rate=.001, max_iter=10000)
    reduced_embeddings = tsne_model.fit_transform(vectors)

    # Cluster in the ORIGINAL high-dimensional space, not the t-SNE plane.
    clusterer = hdbscan.HDBSCAN(algorithm='best',
                                alpha=alpha,
                                gen_min_span_tree=True,
                                cluster_selection_epsilon=.1,
                                leaf_size=leaf_size,
                                metric=hdbscan_dist_metric,
                                min_cluster_size=min_cluster_size,
                                min_samples=min_samples,
                                branch_detection_data=True,
                                p=None)
    cluster_labels = clusterer.fit_predict(vectors)

    # Score against ground truth; unmapped terms count as noise (-1).
    true_labels = [term_to_class_dict.get(term, -1) for term in terms]
    ari = adjusted_rand_score(true_labels, cluster_labels)
    nmi = normalized_mutual_info_score(true_labels, cluster_labels)

    # Seeded RNG so cluster colors are reproducible run-to-run. The previous
    # version drew from the global np.random state, so the same data produced
    # different colors on every call even though t-SNE was seeded.
    color_rng = np.random.RandomState(42)

    fig = go.Figure()
    term_classes = []
    # sorted() makes trace/legend order deterministic (set iteration is not).
    for label in sorted(set(cluster_labels)):
        mask = cluster_labels == label
        cluster_points = reduced_embeddings[mask]
        cluster_terms = [term for term, hit in zip(terms, mask) if hit]
        term_classes.extend((label, term) for term in cluster_terms)

        # Noise (-1) is always grey; real clusters get a random RGB color.
        if label == -1:
            marker_color = 'grey'
        else:
            r, g, b = color_rng.randint(0, 255, size=3)
            marker_color = f'rgb({r}, {g}, {b})'

        fig.add_trace(go.Scatter(
            x=cluster_points[:, 0],
            y=cluster_points[:, 1],
            mode='markers+text',
            name=f'Cluster {label}' if label != -1 else 'Noise',
            marker=dict(color=marker_color, size=10, line=dict(width=1)),
            text=cluster_terms,  # show the term next to each point
            textposition='top center'
        ))

    fig.update_layout(
        title=f"HDBSCAN Clusters on t-SNE with ARI: {ari:.2f}, NMI: {nmi:.2f}",
        xaxis_title="t-SNE Dim 1",
        yaxis_title="t-SNE Dim 2",
        legend_title="Clusters",
        width=900,
        height=800
    )
    return fig, term_classes, ari, nmi
# Run the pipeline under several HDBSCAN configurations, in order:
# manhattan distance, euclidean distance, euclidean with a larger alpha,
# then euclidean with stricter cluster-size / min-samples requirements.
experiment_configs = [
    dict(hdbscan_dist_metric='manhattan'),
    dict(hdbscan_dist_metric='euclidean'),
    dict(hdbscan_dist_metric='euclidean', alpha=.1),
    dict(hdbscan_dist_metric='euclidean', alpha=.1,
         leaf_size=1, min_cluster_size=5, min_samples=10),
]
for config in experiment_configs:
    fig, term_classes, ari, nmi = hdbscan_cluster_plot(evs=evs_list,
                                                       terms=terms,
                                                       term_to_class_dict=term_to_class,
                                                       **config)
    fig.show()